#!/usr/bin/env python
#coding=utf-8



"""
This file is used to extract keyword 
"""


import re
import os
import sys

from structure import wordStruct

from module import FIGModule
from module import TAGModule
from module import ConfigModule

import PUBLICMETHOD



# Process-wide default encoding (Python 2 only).
# sys.setdefaultencoding is removed from the sys namespace during interpreter
# start-up, so the module is reloaded first to get the function back; the
# implicit str <-> unicode conversions then use UTF-8 instead of ASCII.
# NOTE(review): this runs at import time and affects the whole process; neither
# the reload() builtin nor sys.setdefaultencoding exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')

class Keyword:
	"""A keyword text paired with a weight that can be updated later."""

	def __init__(self, content, weight):
		"""Remember the keyword text and its initial weight."""
		self.__content, self.__weight = content, weight

	def setWeight(self, weight):
		"""Overwrite the stored weight with the given value."""
		self.__weight = weight

	def getWeight(self):
		"""Return the weight currently stored for this keyword."""
		return self.__weight

	def getContent(self):
		"""Return the keyword text given at construction time."""
		return self.__content
		

class Extract:
	"""Turn the caption of a figure into a list of weighted keywords.

	The caption is cleaned with regular expressions, split into a word
	queue, stripped of the words listed in the configuration's delete
	dictionary, weighted through the configuration's weight dictionary and
	a few extra pattern rules, sorted by weight and finally filtered into
	Keyword objects.

	NOTE(review): the methods are separate steps that the caller has to run
	itself; presumably in the order in which they are defined here - confirm
	against the caller.
	"""

	# (pattern, points) pairs used by addtionalWordWei, applied in this order.
	_ADDTIONAL_WEI_RULES = (
		# word made only of upper-case letters, commas and non-word characters
		(re.compile(r'^[A-Z,\W]+$'), 3),
		# word containing at least one non-word (punctuation) character
		(re.compile(r'\W'), 3),
		# word made only of digits; anchored at both ends because the rule is
		# applied with search(), which would otherwise also hit "IL6"
		(re.compile(r'^\d+$'), -1),
		# word containing at least one digit
		(re.compile(r'\d'), 3),
	)

	def __init__(self, config):
		"""Create an extractor with an empty caption.

		config -- object providing getDelWordDic(), getDelWordDicIndexStr(),
		          getWeiWordDic() and getWeiWordDicIndexStr().
		"""
		self.__caption = ""
		self.__filteredKeywordList = []
		self.__captionQueue = wordStruct.WordQueue()
		self.__config = config

	def setFigCaption(self, caption):
		"""Set the caption text that the following steps work on."""
		self.__caption = caption

	def addtionalWordDel(self):
		"""Remove noise (single letters, numbers, brackets, punctuation) from the caption."""
		caption = self.__caption

		#delete single letters standing between two non-word characters
		caption = re.sub(r'\W[a-zA-Z]\W', ' ', caption)
		#delete all the word only contain number or "." or "-"
		caption = re.sub(r'\W\-*[\d\.]+\W', ' ', caption)
		#delete all the "(" or ")" or "[" or "]" or "{" or "}" and the "," or "." behind them
		caption = re.sub(r'[\(\)\[\]\{\}][\,\.]?', '', caption)
		#delete consecutive punctuation followed by one blank
		caption = re.sub(r'\W+\s', ' ', caption)
		#delete a punctuation with one blank before and a word character behind
		caption = re.sub(r'(?<=\s)\W(?=\w)', '', caption)
		#delete the punctuation at the very end of the caption; only when the
		#last character really is a non-word character, so that a caption
		#without final punctuation does not lose the last letter of its last word
		caption = re.sub(r'\W\Z', '', caption)

		self.__caption = caption

	def splitWord(self):
		"""Split the caption on whitespace and queue every word with its first character."""
		for data in self.__caption.split():
			self.__captionQueue.addWordWithFirstAlph(data, data[0:1])

	def __findDicEntry(self, word, dic, dicIndex):
		"""Return the entry of dic that matches word, or None.

		dic is indexed by ord() of the lower-cased first letter of the word
		and dicIndex is the string of first letters for which dic has a
		bucket. An entry matches when the word equals the entry content
		or the title()-cased entry content.
		"""
		firstAlph = word.getWordFirstAlph().lower()
		#an empty first letter would pass find() (which returns 0) and break ord()
		if not firstAlph or dicIndex.find(firstAlph) == -1:
			return None

		content = word.getWordContent()
		bucket = dic[ord(firstAlph)]
		for j in range(bucket.getCountWord()):
			entry = bucket.getWordQueue()[j]
			entryContent = entry.getWordContent()
			if content == entryContent or content == entryContent.title():
				return entry
		return None

	def deleteWord(self):
		"""Delete every caption word that is listed in the configuration's delete dictionary."""
		delWordDic = self.__config.getDelWordDic()
		delWordDicIndex = self.__config.getDelWordDicIndexStr()
		queue = self.__captionQueue
		numOfWord = queue.getCountWord()

		i = 0
		while i < numOfWord:
			word = queue.getWordQueue()[i]
			if self.__findDicEntry(word, delWordDic, delWordDicIndex) is None:
				i += 1
				continue
			#the following word moves to index i, so i is examined again
			queue.deleteWordBySeqNo(i)
			numOfWord -= 1

	def setWordWei(self):
		"""Apply the weight dictionary to the caption words.

		When a caption word is found in the weight dictionary, the entry's
		"before" weight is passed to the previous word and its "after"
		weight to the next word (when those neighbours exist); the matching
		word itself is then deleted from the queue.
		"""
		weiWordDic = self.__config.getWeiWordDic()
		weiWordDicIndex = self.__config.getWeiWordDicIndexStr()
		queue = self.__captionQueue
		numOfWord = queue.getCountWord()

		i = 0
		while i < numOfWord:
			entry = self.__findDicEntry(queue.getWordQueue()[i], weiWordDic, weiWordDicIndex)
			if entry is None:
				i += 1
				continue

			if i != 0:
				queue.changeWordWeightBySeqNo(i - 1, entry.getWeightBefore())
			#a weighting word in last position has no following word
			if i + 1 < numOfWord:
				queue.changeWordWeightBySeqNo(i + 1, entry.getWeightAfter())

			#delete the word that was used to set the weights; the following
			#word moves to index i, so i is examined again
			queue.deleteWordBySeqNo(i)
			numOfWord -= 1

	def addtionalWordWei(self):
		"""Apply the extra pattern rules to every caption word.

		Each rule of _ADDTIONAL_WEI_RULES whose pattern is found in the word
		passes its points to changeWordWeightBySeqNo (presumably added to
		the current weight - confirm against WordQueue).
		"""
		wordQueue = self.__captionQueue

		for i in range(wordQueue.getCountWord()):
			for pattern, point in self._ADDTIONAL_WEI_RULES:
				if pattern.search(wordQueue.getWordQueue()[i].getWordContent()):
					wordQueue.changeWordWeightBySeqNo(i, point)

	def partition(self, low, high):
		"""Partition the words in [low, high] around the weight of the word at low.

		Heavier words end up in front of the pivot and lighter ones behind
		it (descending order). Returns the final index of the pivot.
		"""
		#assumes getWordQueue() hands out the live list; the in-place
		#assignments below rely on that
		words = self.__captionQueue.getWordQueue()
		temp = words[low]
		pivot = temp.getWordWeight()

		while low < high:
			#from the back: skip the words that are not heavier than the pivot
			while low < high and words[high].getWordWeight() <= pivot:
				high -= 1
			if low < high:
				words[low] = words[high]
				low += 1

			#from the front: skip the words that are not lighter than the pivot
			while low < high and words[low].getWordWeight() >= pivot:
				low += 1
			if low < high:
				words[high] = words[low]
				high -= 1

		#put the pivot word into the remaining hole
		words[low] = temp
		return low

	def QuickSort(self, low, high):
		"""Sort the words in [low, high] by descending weight (recursive quick sort)."""
		if low < high:
			n = self.partition(low, high)
			#the pivot at n is already in its final place, so it is left out
			#of both halves
			self.QuickSort(low, n - 1)
			self.QuickSort(n + 1, high)

	def rankWordByScore(self):
		"""Sort the whole caption word queue by descending weight."""
		#NOTE(review): every other method uses getCountWord(); confirm that
		#getCountQueue() is the intended accessor here
		high = self.__captionQueue.getCountQueue() - 1
		self.QuickSort(0, high)

	def __mergeKeyword(self, data, ignoreCase, divisor):
		"""Merge data into the filtered keyword list.

		If a keyword with the same content exists (compared case-insensitively
		when ignoreCase is true), its weight becomes the sum of both weights
		divided by divisor; otherwise a new Keyword is appended.
		"""
		content = data.getWordContent()
		wanted = content.lower() if ignoreCase else content

		for keyword in self.__filteredKeywordList:
			existing = keyword.getContent()
			if ignoreCase:
				existing = existing.lower()
			if wanted == existing:
				keyword.setWeight((data.getWordWeight() + keyword.getWeight()) / divisor)
				return

		self.__filteredKeywordList.append(Keyword(content, data.getWordWeight()))

	def filterWord(self):
		"""Collect the keywords of the caption into the filtered keyword list.

		With five words or fewer, every word is kept; duplicates (exact match)
		are averaged. With more words, only those with a weight of at least
		2.0 are kept; duplicates (case-insensitive match) get the sum of both
		weights divided by 1.1.
		"""
		words = self.__captionQueue.getWordQueue()

		if self.__captionQueue.getCountWord() <= 5:
			for data in words:
				#2.0 keeps the average from being floored when both weights are ints
				self.__mergeKeyword(data, False, 2.0)
		else:
			for data in words:
				if data.getWordWeight() >= 2.0:
					self.__mergeKeyword(data, True, 1.1)

	def getFilteredKeywordList(self):
		"""Return the list of Keyword objects built by filterWord."""
		return self.__filteredKeywordList

	def showWordContent(self):
		"""Print content and weight of every caption word, one value per line."""
		for data in self.__captionQueue.getWordQueue():
			print(data.getWordContent())
			print(data.getWordWeight())

	def showFilterWord(self):
		"""Print content and weight of every filtered keyword, one value per line."""
		for data in self.__filteredKeywordList:
			print(data.getContent())
			print(data.getWeight())